import pandas as pd
#create dataframe from scratch
raw_data = {
"name": ["Ben","Bam","Boy","Bas","Boss"],
"age":[33,20,31,43,21],
"gender":["M","F","M","M","M"]
}
df = pd.DataFrame(raw_data)
df
df["city"] = ['London','London','London','Liverpool','Bristol']
df.shape
(5, 4)
# Drop column city
df.drop('city',axis = 1) #drop column
df.drop(2,axis=0) #drop row
# reset index
df = df.reset_index(drop = True)
df
# column name
list(df.columns)
# rename column
df.columns = ['nickname','age','sex','town']
# create a new series
s1 = pd.Series(['base',15,'F','BKK'],index=['nickname','age','sex','town'])
print(s1)
print(type(s1))
nickname base
age 15
sex F
town BKK
dtype: object
<class 'pandas.core.series.Series'>
age 15
sex F
town BKK
dtype: object
<class 'pandas.core.series.Series'>
# append series in record
df = df.append(s1,ignore_index=True)
df
# append series with new dimension
s2 = pd.Series(['England','England','England','England','England','Thailand'])
df['Nation'] = s2
df
# write csv file
df.to_csv('mydata.csv')
# import csv file
df2 = pd.read_csv('Data/demo_export.csv')
df2
#import xlsx file
df3 = pd.read_excel('Data/msa.xlsx')
df3
#import json file
df4 = pd.read_json('Data/data.json')
df4
penguins = pd.read_csv('Data/penguins.csv')
#preview head
penguins.head()
#preview tail
penguins.tail()
# shape (attribute)
penguins.shape
(344, 7)
# information
penguins.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 species 344 non-null object
1 island 344 non-null object
2 bill_length_mm 342 non-null float64
3 bill_depth_mm 342 non-null float64
4 flipper_length_mm 342 non-null float64
5 body_mass_g 342 non-null float64
6 sex 333 non-null object
dtypes: float64(4), object(3)
memory usage: 18.9+ KB
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 species 344 non-null object
1 island 344 non-null object
2 bill_length_mm 342 non-null float64
3 bill_depth_mm 342 non-null float64
4 flipper_length_mm 342 non-null float64
5 body_mass_g 342 non-null float64
6 sex 333 non-null object
dtypes: float64(4), object(3)
memory usage: 18.9+ KB
#select column single column
penguins['species']
penguins.species.head
<bound method NDFrame.head of 0 Adelie
1 Adelie
2 Adelie
3 Adelie
4 Adelie
...
339 Gentoo
340 Gentoo
341 Gentoo
342 Gentoo
343 Gentoo
Name: species, Length: 344, dtype: object>
1 Adelie
2 Adelie
3 Adelie
4 Adelie
...
339 Gentoo
340 Gentoo
341 Gentoo
342 Gentoo
343 Gentoo
Name: species, Length: 344, dtype: object>
penguins[['species','island','sex']].head(3)
#integer location base indexing(iloc)
penguins.iloc[[0,1,2],[0,2]]
#filters row by a condition
penguins[penguins['island'] == 'Torgersen']
penguins[penguins['bill_length_mm']<34]
# filter more than one criteria/condition with and
penguins[(penguins['island']=='Torgersen') & (penguins['bill_length_mm']<35)]
# with or
penguins[(penguins['island']=='Torgersen') & (penguins['bill_length_mm']<35)]
# filter with query (method)
penguins.query('island == "Torgersen"')
# more condition
penguins.query('island =="Torgersen" | bill_length_mm <35 ')
#check missing value in each column
penguins.isna().sum()
#filter missing value with Nan
penguins[penguins['sex'].isna()]
#drop NA
clean_penguins = penguins.dropna()
clean_penguins.isna().sum()
#sort single column
penguins.sort_values('bill_length_mm',ascending=False)
#sort multiple columns
penguins.sort_values(['island','bill_length_mm'])
#unique values
penguins['island'].unique()
#count value
penguins['island'].value_counts()
#count more than one value
penguins[['island','species']].value_counts().reset_index()
#summarise dataframe
penguins.describe()
#summarise df with all column (include text data)
penguins.describe(include='all')
#summarise just average
penguins['bill_length_mm'].mean()
penguins['bill_length_mm'].std()
penguins['bill_length_mm'].median()
44.45
#group by + sum/mean
penguins.groupby('species')['bill_length_mm'].mean()
#groupby aggregation
penguins.groupby('species')['bill_length_mm'].agg(['min','max','median','mean','std'])
#groupby more than one column
result = penguins.groupby(['island','species'])['bill_length_mm'].agg(['min','max'])
#reset index
result = result.reset_index()
result
#write to csv
result.to_csv('result_penguin.csv')
# if your code lond with \
result = penguins.groupby(['island','species'])['bill_length_mm']\
.agg(['min','max'])
result
#map value Male : m , Female : f
penguins['sex'].map({'MALE':'m','FEMALE':'f'}).fillna('other')
#numpy
import numpy as np
np.mean(penguins['bill_length_mm'])
43.9219298245614
# where with numpy
score = pd.Series([80,92,95,70,64])
grade = np.where(score>=80,"pass","failed")
print(grade)
['pass' 'pass' 'pass' 'failed' 'failed']
df = penguins.query("species == 'Adelie'")[['species','island','bill_length_mm']].dropna()
df['new_column'] = np.where(df['bill_length_mm']>40,"true","false")
left = {
'key':[1,2,3,4],
'name':['toy','ben','cat','john'],
'age':[22,31,42,54]
}
right = {
'key':[1,2,3,4],
'city':['BKK','London','New york','Tokyo'],
'zip':[1001,2032,3452,2314]
}
df_left = pd.DataFrame(left)
df_right = pd.DataFrame(right)
dfmix = pd.merge(df_left,df_right,on='key')
dfmix
#pandas plot
#histrogram
penguins['body_mass_g'].plot(kind='hist'); #one dimension
penguins[['body_mass_g','bill_length_mm']].plot(kind='hist',bins=30,color='orange'); #two dimension
#bar plot
penguins['species'].value_counts().plot(kind = 'bar',color=['orange','salmon','gold'])
<AxesSubplot:>
#scatter plot
penguins[['bill_length_mm','bill_depth_mm']].plot(x='bill_length_mm',y='bill_depth_mm',kind='scatter',color = 'purple')
<AxesSubplot:xlabel='bill_length_mm', ylabel='bill_depth_mm'>
penguins